package org.hackreduce.storm.example.wikipedia;

import backtype.storm.Config;
import backtype.storm.generated.AlreadyAliveException;
import backtype.storm.generated.InvalidTopologyException;
import backtype.storm.spout.SchemeAsMultiScheme;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Tuple;
import com.google.common.base.Joiner;
import org.hackreduce.storm.HackReduceStormSubmitter;
import org.hackreduce.storm.example.common.Common;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import storm.kafka.KafkaSpout;
import storm.kafka.SpoutConfig;
import storm.kafka.StringScheme;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.hackreduce.storm.HackReduceStormSubmitter.teamPrefix;

/**
 * A simple Storm topology that reads Wikipedia articles from a Kafka topic,
 * extracts up to ten wiki links from each article, and logs them.
 */
public class LinkLogger {

    public static class LinkExtractorBolt extends BaseRichBolt {

        private static final Logger LOG = LoggerFactory.getLogger(LinkExtractorBolt.class);

        private static final int MAX_LINKS = 10;

        // Matches the article title, e.g. <title>Apache Storm</title>
        private static final Pattern TITLE_REGEX = Pattern.compile("<title>(.+?)</title>");

        // Matches wiki links, e.g. [[Target]] or [[Target|display text]]
        private static final Pattern LINK_REGEX = Pattern.compile("\\[\\[(.+?)\\]\\]");

        private OutputCollector collector;

        @Override
        public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
            this.collector = outputCollector;
        }

        @Override
        public void execute(Tuple tuple) {
            String content = tuple.getString(0);

            // Get the article title
            Matcher titleMatcher = TITLE_REGEX.matcher(content);
            String title = titleMatcher.find() ? titleMatcher.group(1) : "<unknown>";

            // Collect up to MAX_LINKS link targets, dropping any "|display text" suffix
            Matcher matcher = LINK_REGEX.matcher(content);
            List<String> links = new ArrayList<String>();
            while (matcher.find() && links.size() < MAX_LINKS) {
                try {
                    String linkTarget = matcher.group(1).split("\\|")[0];
                    links.add(linkTarget);
                } catch (ArrayIndexOutOfBoundsException aioobe) {
                    // split() returns an empty array for degenerate links like [[|]]
                    LOG.warn("Couldn't parse link: " + matcher.group(1));
                }
            }

            LOG.info("Found {} links on page '{}': ({})",
                new Object[]{ links.size(), title, Joiner.on(", ").join(links) });

            collector.ack(tuple);
        }

        @Override
        public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
            // This bolt emits no tuples, so there are no output fields to declare
        }
    }

    public static void main(String[] args) throws AlreadyAliveException, InvalidTopologyException {
        TopologyBuilder builder = new TopologyBuilder();

        // Configuration
        Config config = new Config();

        SpoutConfig spoutConfig = new SpoutConfig(
            Common.getKafkaHosts(),
            "wikipedia_articles",
            "/kafkastorm",
            teamPrefix("wikipedia-state")
        );
        spoutConfig.scheme = new SchemeAsMultiScheme(new StringScheme());

        // This tells the spout to start at the very beginning of the data stream.
        // If you just want to resume where you left off, remove this line.
        spoutConfig.forceStartOffsetTime(-2);

        builder.setSpout("articles", new KafkaSpout(spoutConfig));

        builder.setBolt("link-logger", new LinkExtractorBolt())
            .shuffleGrouping("articles");

        // Launch
        HackReduceStormSubmitter.submitTopology("wikipedia-logger", config, builder.createTopology());
    }
}
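
// --- Usage sketch ------------------------------------------------------------
// A minimal example of submitting this topology with the standard Storm CLI,
// assuming the project has been packaged into a jar. The jar path below is a
// hypothetical example, not a path defined in this repo:
//
//   storm jar target/storm-examples.jar \
//       org.hackreduce.storm.example.wikipedia.LinkLogger
//
// HackReduceStormSubmitter.submitTopology() wraps the cluster submission
// details (note the teamPrefix helper imported above); see that class for
// the specifics of how topologies are namespaced per team.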